#Load .RData
load('.RData')
load('trips_2015.RData')
# clean 2015 data and add other predictors
trips_per_day_2015 <- trips_per_day_2015 %>%
left_join(holiday, by = "ymd") %>%
mutate(is_holiday = !(is.na(holiday_name)),
weekday = weekdays(as.POSIXct(ymd), abbreviate = T),
month = format(ymd,"%m"),
is_flu_season = month %in% flu_season) %>%
select(-holiday_name, -day_num, -month) %>%
rename(num_trips = trip_num)
# divide tmin and tmax by 10(same as 2014)
trips_per_day_2015$tmin <-trips_per_day_2015$tmin/10
trips_per_day_2015$tmax <-trips_per_day_2015$tmax/10
\(R^2\) is 0.72, which is much lower than what was expected.
test_R_square <- rsquare(model, trips_per_day_2015)
test_R_square
## [1] 0.7245831
rmse(model, trips_per_day_2015)
## [1] 8036.401
#Add predictions
plot_test_data<- trips_per_day_2015 %>%
add_predictions(model)
ggplotly(ggplot(plot_test_data, aes(x= ymd, y = pred))+
geom_point(aes(y= num_trips))+
geom_line(aes(y=pred), color = "red")+
geom_point(aes(y=pred), color = "red") +
geom_smooth() +
xlab("Date") +
ylab("Predicted (in red)/ Actual (in black)")+
ggtitle("Number of trips at different dates"))
ggplot(plot_test_data, aes(x=pred, y =num_trips ))+
geom_point()+
geom_abline(linetype = "dashed") +
xlab('Predicted') +
ylab('Actual')
Number of trips in mid-September through early November were unusually high compare to 2014. This explains why the R^2 is 0.7245831 when fitting the model to 2015 data.
# plot_train_data contains data clean data from 2014
plot_train_data<- rbind(train_data,validate_data, test_data)
plot_train_data <- plot_train_data %>%
add_predictions(model) %>%
filter(ymd!="2014-04-30") %>%
mutate(Year = "2014")# add label for plotting
# plot_test_data contains clean data from 2015
plot_test_data <- plot_test_data %>%
mutate(Year = "2015")
#combine all data
plot_all_data <-rbind(plot_train_data, plot_test_data)
# plot 2014 and 2015 with predicted values
ggplotly(ggplot(plot_all_data, aes(x= ymd, y = pred, color = Year),alpha=0.7)+
geom_point(aes(y= num_trips))+
geom_line(aes(y=pred),color = "blue", alpha=0.4)+
geom_point(aes(y=pred),color = "blue",alpha=0.4)+
geom_smooth() +
xlab("Date") +
ylab("Predicted (in blue)/ Actual")+
ggtitle("Number of trips at different dates"))
# plot predicted values vs actual values
ggplot(plot_all_data, aes(x=pred, y =num_trips, color =Year ))+
geom_point()+
geom_abline(linetype = "dashed") +
xlab('Predicted') +
ylab('Actual')
# use model to predict data from 2014
rsquare(model, plot_train_data)
## [1] 0.9043868
rmse(model, plot_train_data)
## [1] 3156.534
My model performed much worse than what I expected. Number of trips in 2015 had a different trend compare to 2014’s. There are many unexpected factors that may have great impact on the number of trips, such as social media trends, unexpected severe weather conditions, promotions on healthy life styles. When running the 2015 data, I first got negative R^2. I checked the data and found tmin and tmax were much larger than 2014. Krushang told me that those values were divided by 10 in year 2014. Other than that, I did not face much difficulty when running the model on 2015 data and Krushang’s code and Basira’s code. Everything went smoothly.